In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Importing dataset
df = pd.read_csv("diabetes.csv")
print(df.head())
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \ 0 6 148 72 35 0 33.6 1 1 85 66 29 0 26.6 2 8 183 64 0 0 23.3 3 1 89 66 23 94 28.1 4 0 137 40 35 168 43.1 DiabetesPedigreeFunction Age Outcome 0 0.627 50 1 1 0.351 31 0 2 0.672 32 1 3 0.167 21 0 4 2.288 33 1
In [4]:
print(df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 DiabetesPedigreeFunction 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB None
In [18]:
print(df.describe)
<bound method NDFrame.describe of Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1
.. ... ... ... ... ... ...
763 10 101 76 48 180 32.9
764 2 122 70 27 0 36.8
765 5 121 72 23 112 26.2
766 1 126 60 0 0 30.1
767 1 93 70 31 0 30.4
DiabetesPedigreeFunction Age Outcome
0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
3 0.167 21 0
4 2.288 33 1
.. ... ... ...
763 0.171 63 0
764 0.340 27 0
765 0.245 30 0
766 0.349 47 1
767 0.315 23 0
[768 rows x 9 columns]>
In [6]:
print(df.isnull().sum())
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
In [7]:
# Summary statistics for the dataset
summary_statistics = df.describe()
print(summary_statistics)
Pregnancies Glucose BloodPressure SkinThickness Insulin \
count 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479
std 3.369578 31.972618 19.355807 15.952218 115.244002
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000
75% 6.000000 140.250000 80.000000 32.000000 127.250000
max 17.000000 199.000000 122.000000 99.000000 846.000000
BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000
mean 31.992578 0.471876 33.240885 0.348958
std 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.078000 21.000000 0.000000
25% 27.300000 0.243750 24.000000 0.000000
50% 32.000000 0.372500 29.000000 0.000000
75% 36.600000 0.626250 41.000000 1.000000
max 67.100000 2.420000 81.000000 1.000000
In [10]:
# Measures of central tendency
mean_values = df.mean()
median_values = df.median()
mode_values = df.mode().iloc[0]
# Measures of dispersion
std_dev_values = df.std()
variance_values = df.var()
range_values = df.max() - df.min()
# Display these measures
print("Mean Values:\n", mean_values)
Mean Values: Pregnancies 3.845052 Glucose 120.894531 BloodPressure 69.105469 SkinThickness 20.536458 Insulin 79.799479 BMI 31.992578 DiabetesPedigreeFunction 0.471876 Age 33.240885 Outcome 0.348958 dtype: float64
In [11]:
print("Median Values:\n", median_values)
Median Values: Pregnancies 3.0000 Glucose 117.0000 BloodPressure 72.0000 SkinThickness 23.0000 Insulin 30.5000 BMI 32.0000 DiabetesPedigreeFunction 0.3725 Age 29.0000 Outcome 0.0000 dtype: float64
In [12]:
print("Mode Values:\n", mode_values)
Mode Values: Pregnancies 1.000 Glucose 99.000 BloodPressure 70.000 SkinThickness 0.000 Insulin 0.000 BMI 32.000 DiabetesPedigreeFunction 0.254 Age 22.000 Outcome 0.000 Name: 0, dtype: float64
In [13]:
print("Standard Deviation Values:\n", std_dev_values)
Standard Deviation Values: Pregnancies 3.369578 Glucose 31.972618 BloodPressure 19.355807 SkinThickness 15.952218 Insulin 115.244002 BMI 7.884160 DiabetesPedigreeFunction 0.331329 Age 11.760232 Outcome 0.476951 dtype: float64
In [14]:
print("Variance Values:\n", variance_values)
Variance Values: Pregnancies 11.354056 Glucose 1022.248314 BloodPressure 374.647271 SkinThickness 254.473245 Insulin 13281.180078 BMI 62.159984 DiabetesPedigreeFunction 0.109779 Age 138.303046 Outcome 0.227483 dtype: float64
In [15]:
print("Range Values:\n", range_values)
Range Values: Pregnancies 17.000 Glucose 199.000 BloodPressure 122.000 SkinThickness 99.000 Insulin 846.000 BMI 67.100 DiabetesPedigreeFunction 2.342 Age 60.000 Outcome 1.000 dtype: float64
In [16]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
# outliers for each column
outliers = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR)))
print(outliers.sum())
Pregnancies 4 Glucose 5 BloodPressure 45 SkinThickness 1 Insulin 34 BMI 19 DiabetesPedigreeFunction 29 Age 9 Outcome 0 dtype: int64
In [17]:
# Exploring missing values
df.isnull().sum()
Out[17]:
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
In [3]:
# Show top 5 rows
df.head(20)
Out[3]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| 5 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 |
| 6 | 3 | 78 | 50 | 32 | 88 | 31.0 | 0.248 | 26 | 1 |
| 7 | 10 | 115 | 0 | 0 | 0 | 35.3 | 0.134 | 29 | 0 |
| 8 | 2 | 197 | 70 | 45 | 543 | 30.5 | 0.158 | 53 | 1 |
| 9 | 8 | 125 | 96 | 0 | 0 | 0.0 | 0.232 | 54 | 1 |
| 10 | 4 | 110 | 92 | 0 | 0 | 37.6 | 0.191 | 30 | 0 |
| 11 | 10 | 168 | 74 | 0 | 0 | 38.0 | 0.537 | 34 | 1 |
| 12 | 10 | 139 | 80 | 0 | 0 | 27.1 | 1.441 | 57 | 0 |
| 13 | 1 | 189 | 60 | 23 | 846 | 30.1 | 0.398 | 59 | 1 |
| 14 | 5 | 166 | 72 | 19 | 175 | 25.8 | 0.587 | 51 | 1 |
| 15 | 7 | 100 | 0 | 0 | 0 | 30.0 | 0.484 | 32 | 1 |
| 16 | 0 | 118 | 84 | 47 | 230 | 45.8 | 0.551 | 31 | 1 |
| 17 | 7 | 107 | 74 | 0 | 0 | 29.6 | 0.254 | 31 | 1 |
| 18 | 1 | 103 | 30 | 38 | 83 | 43.3 | 0.183 | 33 | 0 |
| 19 | 1 | 115 | 70 | 30 | 96 | 34.6 | 0.529 | 32 | 1 |
In [19]:
corr = df.corr()
sns.heatmap(corr, annot=True, square=True)
plt.show()
In [14]:
df.groupby('Outcome').mean()
Out[14]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | |
|---|---|---|---|---|---|---|---|---|
| Outcome | ||||||||
| 0 | 3.298000 | 109.980000 | 68.184000 | 19.664000 | 68.792000 | 30.304200 | 0.429734 | 31.190000 |
| 1 | 4.865672 | 141.257463 | 70.824627 | 22.164179 | 100.335821 | 35.142537 | 0.550500 | 37.067164 |
In [16]:
df.groupby('Outcome').mean().T.plot(figsize=(12,4))
Out[16]:
<Axes: >
In [18]:
sns.countplot(x='Outcome',data=df)
Out[18]:
<Axes: xlabel='Outcome', ylabel='count'>
In [29]:
# Box plots to identify outliers
plt.figure(figsize=(10, 5))
sns.boxplot(df)
plt.xticks(rotation=45)
plt.show()
In [30]:
# Define a function to remove outliers based on the IQR method
def remove_outliers(df, columns):
for column in columns:
Q1 = df[column].quantile(0.25)
Q3 = df[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
return df
# List of columns from which to remove outliers
columns_to_check = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
# Remove outliers
df_clean = remove_outliers(df, columns_to_check)
# Display the shape of the dataset before and after removing outliers
print("Original dataset shape:", df.shape)
print("Cleaned dataset shape:", df_clean.shape)
Original dataset shape: (768, 9) Cleaned dataset shape: (636, 9)
In [32]:
# Box plots to identify outliers
plt.figure(figsize=(10, 5))
sns.boxplot(df_clean)
plt.xticks(rotation=45)
plt.show()
In [43]:
sns.pairplot(data=df_clean,kind='scatter')
plt.savefig('output.png') # Save that figure
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
In [44]:
sns.pairplot(data=df_clean,hue='Outcome')
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\chaud\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
Out[44]:
<seaborn.axisgrid.PairGrid at 0x25e3f5ecc90>
In [46]:
sns.scatterplot(x='Glucose',y='Insulin',data=df_clean)
plt.show()
In [19]:
diabetes = df[df['Outcome'] == 1]
no_diabetes = df[df['Outcome'] == 0]
# Display the first few rows of each group
diabetes.head(), no_diabetes.head()
Out[19]:
( Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0 6 148 72 35 0 33.6
2 8 183 64 0 0 23.3
4 0 137 40 35 168 43.1
6 3 78 50 32 88 31.0
8 2 197 70 45 543 30.5
DiabetesPedigreeFunction Age Outcome
0 0.627 50 1
2 0.672 32 1
4 2.288 33 1
6 0.248 26 1
8 0.158 53 1 ,
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
1 1 85 66 29 0 26.6
3 1 89 66 23 94 28.1
5 5 116 74 0 0 25.6
7 10 115 0 0 0 35.3
10 4 110 92 0 0 37.6
DiabetesPedigreeFunction Age Outcome
1 0.351 31 0
3 0.167 21 0
5 0.201 30 0
7 0.134 29 0
10 0.191 30 0 )
In [21]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
# Extract glucose levels for both groups
glucose_with_diabetes = diabetes['Glucose']
glucose_without_diabetes = no_diabetes['Glucose']
# Perform an independent t-test
t_stat, p_value = stats.ttest_ind(glucose_with_diabetes, glucose_without_diabetes)
t_stat, p_value
Out[21]:
(14.600060005973894, 8.935431645289913e-43)
In [22]:
# Extract BMI for both groups
bmi_with_diabetes = diabetes['BMI']
bmi_without_diabetes = no_diabetes['BMI']
# Perform an independent t-test
t_stat_bmi, p_value_bmi = stats.ttest_ind(bmi_with_diabetes, bmi_without_diabetes)
t_stat_bmi, p_value_bmi
Out[22]:
(8.47183994786525, 1.2298074873116022e-16)
In [23]:
# Extract age for both groups
age_with_diabetes = diabetes['Age']
age_without_diabetes = no_diabetes['Age']
# Perform an independent t-test
t_stat_age, p_value_age = stats.ttest_ind(age_with_diabetes, age_without_diabetes)
t_stat_age, p_value_age
Out[23]:
(6.792688071649956, 2.2099754606654358e-11)
In [24]:
# Extract the number of pregnancies for both groups
preg_with_diabetes = diabetes['Pregnancies']
preg_without_diabetes = no_diabetes['Pregnancies']
# Perform an independent t-test
t_stat_preg, p_value_preg = stats.ttest_ind(preg_with_diabetes, preg_without_diabetes)
print(f'Number of Pregnancies - t-statistic: {t_stat_preg}, p-value: {p_value_preg}')
Number of Pregnancies - t-statistic: 6.298430550035151, p-value: 5.065127298053476e-10